Scenario 1: sampling minibatches from fully observed datasets

To perform online iNMF, we need to install the online branch. The mergeH5 function enables users to merge multiple HDF5 outputs from 10x Cell Ranger. Please see the instruction below.

library(devtools)
install_github("MacoskoLab/liger", ref = "online")

We first create a liger object by passing the filenames of HDF5 files containing the raw count data. The data can be downloaded here.

library(liger)
pbmcs = createLiger(list(stim = "pbmcs_stim.h5", ctrl = "pbmcs_ctrl.h5"))

We then perform the normalization, gene selection, and gene scaling in an online fashion, reading the data from disk in small batches.

pbmcs = normalize(pbmcs)
pbmcs = selectGenes(pbmcs, var.thresh = 0.2, do.plot = F)
pbmcs = scaleNotCenter(pbmcs)

Online Integrative Nonnegative Matrix Factorization

Now we can use online iNMF to factorize the data, again using only minibatches that we read from the HDF5 files on demand (default mini-batch size = 5000).

pbmcs = online_iNMF(pbmcs, k = 20, max.epochs = 5)

Quantile Normalization and Downstream Analysis

After performing the factorization, we can perform quantile normalization to align the datasets.

pbmcs = quantile_norm(pbmcs)

We can also visualize the cell factor loadings in two dimensions using t-SNE or UMAP.

pbmcs = runUMAP(pbmcs)
plotByDatasetAndCluster(pbmcs, axis.labels = c("UMAP1","UMAP2"))

We can also compare clusters or datasets (within each cluster) to identify differentially expressed genes. The runWilcoxon function performs differential expression analysis by sampling a specified number of cells from the dataset on disk, then performing an in-memory Wilcoxon rank-sum test on this subset. Thus, users can still analyze large datasets with a fixed amount of memory.

de_genes = runWilcoxon(pbmcs, compare.method = "clusters", max.sample = 5000)
de_genes = de_genes[order(de_genes$padj), ]
head(de_genes[de_genes$group == 1,], n = 10)
##         feature group    avgExpr      logFC statistic       auc          pval
## 12676     CD79A     1 -13.081037   9.221907 1171215.0 0.7843453 4.811752e-284
## 7663      MS4A1     1 -16.142007   6.461321 1046510.0 0.7008322 2.574541e-214
## 3988       CD74     1  -4.824997   6.827074 1368652.5 0.9165663 4.188542e-141
## 7224       BLNK     1 -18.796213   3.917751  932647.0 0.6245799 4.024384e-118
## 9509      TCL1A     1 -20.262259   2.658730  870528.0 0.5829797 8.451992e-108
## 6527      ANXA1     1 -21.082136 -11.432903  214735.0 0.1438048 2.004959e-102
## 10587      IRF8     1 -15.494765   6.110693 1042553.5 0.6981826 9.547656e-102
## 9003  KIAA0226L     1 -19.874406   2.965710  885999.5 0.5933407  2.185971e-99
## 4923    TSPAN13     1 -19.932755   2.923817  879879.5 0.5892422  2.775645e-97
## 3251        IGJ     1 -20.463715   2.454131  858873.5 0.5751748  5.494269e-93
##                padj pct_in pct_out
## 12676 6.761955e-280    100     100
## 7663  1.809001e-210    100     100
## 3988  1.962053e-137    100     100
## 7224  1.413867e-114    100     100
## 9509  2.375517e-104    100     100
## 6527   4.695949e-99    100     100
## 10587  1.916760e-98    100     100
## 9003   3.839931e-96    100     100
## 4923   4.334016e-94    100     100
## 3251   7.721096e-90    100     100

The online algorithm can be implemented on datasets loaded in memory as well. The same analysis is performed on the PBMCs, shown below.

stim = readRDS("pbmcs_stim.RDS")
ctrl = readRDS("pbmcs_ctrl.RDS")
pbmcs_mem = createLiger(list(stim = stim, ctrl = ctrl), remove.missing = F)
pbmcs_mem = normalize(pbmcs_mem)
pbmcs_mem = selectGenes(pbmcs_mem, var.thresh = 0.2, do.plot = F)
pbmcs_mem = scaleNotCenter(pbmcs_mem)
pbmcs_mem = online_iNMF(pbmcs_mem, k = 20, max.epochs = 5)
pbmcs_mem = quantile_norm(pbmcs_mem)
pbmcs_mem = runUMAP(pbmcs_mem)
plotByDatasetAndCluster(pbmcs_mem, axis.labels = c("UMAP1","UMAP2"))

Moreover, the package allows the user to merge multiple HDF5 files. The code snippet below shows how it works (only for demonstration).

mergeH5(file.list = list("allen_smarter_cells.h5", "allen_smarter_nuclei.h5"), 
        library.names = c("cells","nuclei"),
        new.filename = "cells_nuclei.h5")

Scenario 2: iterative refinement by incorporating new datasets

We can also perform online iNMF with continually arriving datasets.

MOp = createLiger(list(cells = "allen_smarter_cells.h5"))
MOp = normalize(MOp)
MOp = selectGenes(MOp, var.thresh = 2)
MOp.vargenes = MOp@var.genes
MOp = scaleNotCenter(MOp)
MOp = online_iNMF(MOp, k = 40, max.epochs = 1)
MOp = quantile_norm(MOp)
MOp = runUMAP(MOp)
plotByDatasetAndCluster(MOp, axis.labels = c("UMAP1","UMAP2"))

MOp2 = createLiger(list(nuclei = "allen_smarter_nuclei.h5"))
MOp2 = normalize(MOp2)
MOp2@var.genes = MOp@var.genes
MOp2 = scaleNotCenter(MOp2)
MOp = online_iNMF(MOp, X_new = list(nuclei = MOp2), k = 40, max.epochs = 1)
MOp = quantile_norm(MOp)
MOp = runUMAP(MOp)
plotByDatasetAndCluster(MOp, axis.labels = c("UMAP1","UMAP2"))

Scenario 3: projecting new datasets

MOp = createLiger(list(cells = "allen_smarter_cells.h5"))
MOp@var.genes = MOp.vargenes
MOp = online_iNMF(MOp, k = 40, max.epochs = 1)
MOp = quantile_norm(MOp)
MOp = runUMAP(MOp)
plotByDatasetAndCluster(MOp, axis.labels = c("UMAP1","UMAP2"))

MOp = online_iNMF(MOp, X_new = list(nuclei = MOp2), k = 40, project = TRUE)
MOp = quantile_norm(MOp)
MOp = runUMAP(MOp)
plotByDatasetAndCluster(MOp, axis.labels = c("UMAP1","UMAP2"))